-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[VPlan] Emit VPVectorEndPointerRecipe for reverse interleave pointer adjustment #144864
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-llvm-transforms Author: Mel Chen (Mel-Chen) Changes: This patch introduces VPReverseInterleavePtrRecipe, a new recipe that adjusts the pointer of a reverse interleave group. It takes the pointer of member 0 and the VF as operands and computes the pointer to the last vector lane. The final goal is to support EVL tail folding for interleaved accesses. Given that VPInterleaveRecipe is large and tightly coupled — combining both load and store, and embedding operations like reverse pointer adjustment (GEP), widen load/store, deinterleave/interleave, and reversal — breaking it down into smaller, dedicated recipes may allow VPlanTransforms::tryAddExplicitVectorLength to lower them into EVL-aware form more effectively. One foreseeable challenge is that VPlanTransforms::convertToConcreteRecipes currently runs after tryAddExplicitVectorLength, so decomposing VPInterleaveRecipe will likely need to happen earlier in the pipeline to be effective. Full diff: https://github.com/llvm/llvm-project/pull/144864.diff 7 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f887b34e76422..ce40c6ccba92e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4256,6 +4256,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPDerivedIVSC:
case VPDef::VPScalarIVStepsSC:
case VPDef::VPReplicateSC:
+ case VPDef::VPReverseInterleavePtrSC:
case VPDef::VPInstructionSC:
case VPDef::VPCanonicalIVPHISC:
case VPDef::VPVectorPointerSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f3306ad7cb8ec..daef26fe86d79 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -531,6 +531,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
case VPRecipeBase::VPReductionSC:
+ case VPRecipeBase::VPReverseInterleavePtrSC:
case VPRecipeBase::VPMulAccumulateReductionSC:
case VPRecipeBase::VPExtendedReductionSC:
case VPRecipeBase::VPReplicateSC:
@@ -851,6 +852,7 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
R->getVPDefID() == VPRecipeBase::VPReductionSC ||
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
+ R->getVPDefID() == VPRecipeBase::VPReverseInterleavePtrSC ||
R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC ||
R->getVPDefID() == VPRecipeBase::VPVectorPointerSC ||
R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
@@ -1796,6 +1798,53 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
#endif
};
+class VPReverseInterleavePtrRecipe : public VPRecipeWithIRFlags {
+ Type *IndexedTy;
+ unsigned Factor;
+
+public:
+ VPReverseInterleavePtrRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
+ unsigned Factor, GEPNoWrapFlags GEPFlags,
+ DebugLoc DL)
+ : VPRecipeWithIRFlags(VPDef::VPReverseInterleavePtrSC,
+ ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
+ IndexedTy(IndexedTy), Factor(Factor) {
+ assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPReverseInterleavePtrSC)
+
+ VPValue *getPtr() const { return getOperand(0); }
+
+ VPValue *getVFValue() const { return getOperand(1); }
+
+ void execute(VPTransformState &State) override;
+
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
+
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override {
+ // TODO: Compute accurate cost after retiring the legacy cost model.
+ return 0;
+ }
+
+ VPReverseInterleavePtrRecipe *clone() override {
+ return new VPReverseInterleavePtrRecipe(getPtr(), getVFValue(), IndexedTy,
+ Factor, getGEPNoWrapFlags(),
+ getDebugLoc());
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A pure virtual base class for all recipes modeling header phis, including
/// phis for first order recurrences, pointer inductions and reductions. The
/// start value is the first operand of the recipe and the incoming value from
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 76da5b0314a8e..98889cb5c520c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -282,9 +282,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
- VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
- return inferScalarType(R->getOperand(0));
- })
+ VPPartialReductionRecipe, VPReverseInterleavePtrRecipe>(
+ [this](const VPRecipeBase *R) {
+ return inferScalarType(R->getOperand(0));
+ })
// VPInstructionWithType must be handled before VPInstruction.
.Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
VPWidenCastRecipe>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1ed0b97849a8d..40dde8cfaea73 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -150,6 +150,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
case VPPredInstPHISC:
+ case VPReverseInterleavePtrSC:
case VPVectorEndPointerSC:
return false;
case VPInstructionSC:
@@ -2262,6 +2263,33 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPReverseInterleavePtrRecipe::execute(VPTransformState &State) {
+ auto &Builder = State.Builder;
+ Value *Ptr = State.get(getPtr(), /*IsScalar*/ true);
+ Value *RuntimeVF = State.get(getVFValue(), /*IsScalar*/ true);
+ Type *IndexTy = Builder.getInt32Ty();
+ if (RuntimeVF->getType() != IndexTy)
+ RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, IndexTy);
+ Value *Index = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
+ Index = Builder.CreateMul(Index, Builder.getInt32(Factor));
+ Index = Builder.CreateNeg(Index);
+ Value *ReversePtr =
+ Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());
+
+ State.set(this, ReversePtr, /*IsScalar*/ true);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPReverseInterleavePtrRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent;
+ printAsOperand(O, SlotTracker);
+ O << " = reverse-interleave-ptr";
+ printFlags(O);
+ printOperands(O, SlotTracker);
+}
+#endif
+
void VPBlendRecipe::execute(VPTransformState &State) {
assert(isNormalized() && "Expected blend to be normalized!");
// We know that all PHIs in non-header blocks are converted into
@@ -3223,25 +3251,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
if (auto *I = dyn_cast<Instruction>(ResAddr))
State.setDebugLocFrom(I->getDebugLoc());
- // If the group is reverse, adjust the index to refer to the last vector lane
- // instead of the first. We adjust the index from the first vector lane,
- // rather than directly getting the pointer for lane VF - 1, because the
- // pointer operand of the interleaved access is supposed to be uniform.
- if (Group->isReverse()) {
- Value *RuntimeVF =
- getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
- Value *Index =
- State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
- Index = State.Builder.CreateMul(Index,
- State.Builder.getInt32(Group->getFactor()));
- Index = State.Builder.CreateNeg(Index);
-
- bool InBounds = false;
- if (auto *Gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
- InBounds = Gep->isInBounds();
- ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);
- }
-
State.setDebugLocFrom(getDebugLoc());
Value *PoisonVec = PoisonValue::get(VecTy);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 11f0f2a930329..6068b87663047 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2489,6 +2489,21 @@ void VPlanTransforms::createInterleaveGroups(
Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
: B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
}
+ // If the group is reverse, adjust the index to refer to the last vector
+ // lane instead of the first. We adjust the index from the first vector
+ // lane, rather than directly getting the pointer for lane VF - 1, because
+ // the pointer operand of the interleaved access is supposed to be uniform.
+ if (IG->isReverse()) {
+ auto *GEP = dyn_cast<GetElementPtrInst>(
+ getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts());
+ auto *ReversePtr = new VPReverseInterleavePtrRecipe(
+ Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), IG->getFactor(),
+ GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
+ : GEPNoWrapFlags::none(),
+ InsertPos->getDebugLoc());
+ ReversePtr->insertBefore(InsertPos);
+ Addr = ReversePtr;
+ }
auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
InsertPos->getMask(), NeedsMaskForGaps, InsertPos->getDebugLoc());
VPIG->insertBefore(InsertPos);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index a0d3dc9b934cc..83f6ac223af1e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -335,6 +335,7 @@ class VPDef {
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
+ VPReverseInterleavePtrSC,
VPMulAccumulateReductionSC,
VPExtendedReductionSC,
VPPartialReductionSC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 7e4edf739695a..0333035a4b0bf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -367,8 +367,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP5]], 3
+; CHECK-NEXT: [[TMP15:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP15]], 1
; CHECK-NEXT: [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]]
@@ -381,8 +381,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP15]], 3
+; CHECK-NEXT: [[TMP21:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP21]], 1
; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
@@ -1579,8 +1579,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 2
; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
@@ -1599,8 +1599,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 4 x i32> [[REVERSE4]], [[VEC_IND]]
; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw <vscale x 4 x i32> [[REVERSE5]], [[VEC_IND]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4
+; CHECK-NEXT: [[TMP22:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 2
; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]]
; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]]
|
As mentioned in this patch, supporting interleaved access with EVL tail folding could be complex. Another, simpler approach might be to directly create a VPInterleaveEVLRecipe and extract as much shared code as possible into static functions. I'd like to open a discussion to clarify which direction would be preferable. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it possible to somehow reuse VPVectorEndPointerRecipe
and add a Factor field on it? The two recipes look quite similar
Mel-Chen@9e1df0f |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With the other
auto *GEP = dyn_cast<GetElementPtrInst>( | ||
getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
would be good to share the logic to determine inbounds from the GEP with similar code above
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, | ||
public VPUnrollPartAccessor<2> { | ||
Type *IndexedTy; | ||
|
||
int64_t Stride; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Needs a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
b50ca85
to
3b1e184
Compare
; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP0]], 3 | ||
; CHECK-NEXT: [[TMP18:%.*]] = sub nsw i64 2, [[TMP15]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like previously the index types for interleave groups were i32 but now it's determined by datalayout because of the changes in getGEPIndexTy. I don't have much of an opinion on this, was there a reason why it was needed for this PR?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The original implementation uses i64 as the index type in the GEP.
I think one reason getGEPIndexTy can't simply choose i32 is due to scalable VF in this case;
another reason is that we compute NumElt = Stride * CurrentPart * RunTimeVF, and if the stride is not unit stride, then we have to use the type determined by datalayout.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
: VPRecipeWithIRFlags(VPDef::VPVectorEndPointerSC, | ||
ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL), | ||
IndexedTy(IndexedTy) {} | ||
IndexedTy(IndexedTy), Stride(Stride) { | ||
assert(Stride != 0 && "Stride cannot be zero"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Stride is currently always negative, right? Can we assert?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, so far it must be negative.
ef6b46a
class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, | ||
public VPUnrollPartAccessor<2> { | ||
Type *IndexedTy; | ||
|
||
/// The constant stride of the pointer computed by this recipe. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// The constant stride of the pointer computed by this recipe. | |
/// The constant stride of the pointer computed by this recipe. The stride will be scaled by IndexedTy. |
Would be good to make clear that the stride is not scaled, but a stride in terms of IndexedTy, unless I am missing something. There might be a clearer way to express this than my suggestion above.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
3b1e184
to
5a7c4bf
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks
; CHECK-NEXT: %26 = getelementptr inbounds i32, ptr %23, i64 %24 | ||
; CHECK-NEXT: %27 = getelementptr inbounds i32, ptr %26, i64 %25 | ||
; CHECK-NEXT: %wide.load = load <vscale x 4 x i32>, ptr %27, align 4 | ||
; CHECK-NEXT: %25 = sub i64 %18, 1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unrelated, but the test should probably not check the full debug output
A reverse interleave access is essentially composed of multiple load/store operations with the same negative stride, and their addresses are based on the last lane address of member 0 in the interleaved group.
Currently, we already have VPVectorEndPointerRecipe for computing the last lane address of consecutive reverse memory accesses. This patch extends VPVectorEndPointerRecipe to support constant stride and extracts the reverse interleave group address adjustment from VPInterleaveRecipe::execute, replacing it with a VPVectorEndPointerRecipe.
The final goal is to support interleaved accesses with EVL tail folding. Given that VPInterleaveRecipe is large and tightly coupled — combining both load and store, and embedding operations like reverse pointer adjustment (GEP), widen load/store, deinterleave/interleave, and reversal — breaking it down into smaller, dedicated recipes may allow VPlanTransforms::tryAddExplicitVectorLength to lower them into EVL-aware form more effectively.
One foreseeable challenge is that VPlanTransforms::convertToConcreteRecipes currently runs after tryAddExplicitVectorLength, so decomposing VPInterleaveRecipe will likely need to happen earlier in the pipeline to be effective.